In [31]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('float_format', '{:f}'.format)
pd.set_option('display.max_columns', None)
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import norm
import pandas_profiling
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# Seed NumPy's global random generator with a fixed integer (the sum of the
# character codes of "aesthetics") so random draws are repeatable across runs.
np.random.seed(sum(map(ord, "aesthetics")))
from sklearn import metrics
In [5]:
import seaborn as sns
In [3]:
# Load the Iris dataset and preview its first rows.
# The base data directory can be overridden through the DMG_DATA_DIR
# environment variable; when unset it falls back to the location originally
# hard-coded in this notebook, so existing setups keep working.
import os

DATA_DIR = os.environ.get('DMG_DATA_DIR', '/Users/rajmati.marlecha/Desktop/DMGAssignment')
iris_dataset = pd.read_csv(os.path.join(DATA_DIR, 'iris-species', 'Iris.csv'))
iris_dataset.head()
Out[3]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa

Question 1 :

In [6]:
# Q1: distribution of each of the four measurements, ignoring the species label.
# One panel per measurement, laid out side by side in a single row.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept
# here so the figure matches the original notebook.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 4))

for axis, feature in zip((ax1, ax2, ax3, ax4), feature_columns):
    axis.set_title(feature)
    sns.distplot(iris_dataset[feature], ax=axis)

plt.show()
In [7]:
# Q1: histograms split by class label -- for each measurement, draw one grid
# with a separate panel per Species value.
for feature in ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'):
    # FacetGrid.map returns the grid, so `g` ends up bound to the last grid drawn.
    g = sns.FacetGrid(iris_dataset, col="Species").map(plt.hist, feature)

Summary Q1 :

  • Looking at the above histograms with class labels, we can see that PetalWidthCm distinguishes the three species best.
  • We can see this from the histogram bins of the three species for PetalWidthCm: Iris-setosa is completely separate from the other two, while Iris-versicolor and Iris-virginica overlap only in a narrow band (1.4 to 1.8). This indicates that we can largely distinguish the species by their PetalWidthCm range of values.
  • The ranges of PetalWidthCm for the three species are as follows:

PetalWidthCm

    * Iris-setosa :  0.1 to 0.6
    * Iris-versicolor : 1.0 to 1.8
    * Iris-virginica : 1.4 to 2.5

PetalLengthCm is not the chosen candidate because there is some overlap between the values for Iris-versicolor and Iris-virginica (4.5 to 5.1 in the ranges below).

  • PetalLengthCm
      * Iris-setosa : 1.0 to 1.9
      * Iris-versicolor : 3.0 to 5.1
      * Iris-virginica : 4.5 to 6.9
In [7]:
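# Extra: full pandas-profiling report of the Iris data.
# The call is commented out; the report shown below is output retained from an earlier run.
#pandas_profiling.ProfileReport(iris_dataset)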
Out[7]:

Overview

Dataset info

Number of variables 6
Number of observations 150
Total Missing (%) 0.0%
Total size in memory 7.1 KiB
Average record size in memory 48.5 B

Variables types

Numeric 4
Categorical 1
Boolean 0
Date 0
Text (Unique) 0
Rejected 1
Unsupported 0

Warnings

Variables

Id
Numeric

Distinct count 150
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 75.5
Minimum 1
Maximum 150
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 8.45
Q1 38.25
Median 75.5
Q3 112.75
95-th percentile 142.55
Maximum 150
Range 149
Interquartile range 74.5

Descriptive statistics

Standard deviation 43.445
Coef of variation 0.57544
Kurtosis -1.2
Mean 75.5
MAD 37.5
Skewness 0
Sum 11325
Variance 1887.5
Memory size 1.2 KiB
Value Count Frequency (%)  
150 1 0.7%
 
56 1 0.7%
 
54 1 0.7%
 
53 1 0.7%
 
52 1 0.7%
 
51 1 0.7%
 
50 1 0.7%
 
49 1 0.7%
 
48 1 0.7%
 
47 1 0.7%
 
Other values (140) 140 93.3%
 

Minimum 5 values

Value Count Frequency (%)  
1 1 0.7%
 
2 1 0.7%
 
3 1 0.7%
 
4 1 0.7%
 
5 1 0.7%
 

Maximum 5 values

Value Count Frequency (%)  
146 1 0.7%
 
147 1 0.7%
 
148 1 0.7%
 
149 1 0.7%
 
150 1 0.7%
 

PetalLengthCm
Numeric

Distinct count 43
Unique (%) 28.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.7587
Minimum 1
Maximum 6.9
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1.3
Q1 1.6
Median 4.35
Q3 5.1
95-th percentile 6.1
Maximum 6.9
Range 5.9
Interquartile range 3.5

Descriptive statistics

Standard deviation 1.7644
Coef of variation 0.46943
Kurtosis -1.4019
Mean 3.7587
MAD 1.5619
Skewness -0.27446
Sum 563.8
Variance 3.1132
Memory size 1.2 KiB
Value Count Frequency (%)  
1.5 14 9.3%
 
1.4 12 8.0%
 
5.1 8 5.3%
 
4.5 8 5.3%
 
1.3 7 4.7%
 
1.6 7 4.7%
 
5.6 6 4.0%
 
4.0 5 3.3%
 
4.9 5 3.3%
 
4.7 5 3.3%
 
Other values (33) 73 48.7%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1 0.7%
 
1.1 1 0.7%
 
1.2 2 1.3%
 
1.3 7 4.7%
 
1.4 12 8.0%
 

Maximum 5 values

Value Count Frequency (%)  
6.3 1 0.7%
 
6.4 1 0.7%
 
6.6 1 0.7%
 
6.7 2 1.3%
 
6.9 1 0.7%
 

PetalWidthCm
Highly correlated

This variable is highly correlated with PetalLengthCm and should be ignored for analysis

Correlation 0.96276

SepalLengthCm
Numeric

Distinct count 35
Unique (%) 23.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 5.8433
Minimum 4.3
Maximum 7.9
Zeros (%) 0.0%

Quantile statistics

Minimum 4.3
5-th percentile 4.6
Q1 5.1
Median 5.8
Q3 6.4
95-th percentile 7.255
Maximum 7.9
Range 3.6
Interquartile range 1.3

Descriptive statistics

Standard deviation 0.82807
Coef of variation 0.14171
Kurtosis -0.55206
Mean 5.8433
MAD 0.68756
Skewness 0.31491
Sum 876.5
Variance 0.68569
Memory size 1.2 KiB
Value Count Frequency (%)  
5.0 10 6.7%
 
6.3 9 6.0%
 
5.1 9 6.0%
 
6.7 8 5.3%
 
5.7 8 5.3%
 
5.5 7 4.7%
 
5.8 7 4.7%
 
6.4 7 4.7%
 
6.0 6 4.0%
 
4.9 6 4.0%
 
Other values (25) 73 48.7%
 

Minimum 5 values

Value Count Frequency (%)  
4.3 1 0.7%
 
4.4 3 2.0%
 
4.5 1 0.7%
 
4.6 4 2.7%
 
4.7 2 1.3%
 

Maximum 5 values

Value Count Frequency (%)  
7.3 1 0.7%
 
7.4 1 0.7%
 
7.6 1 0.7%
 
7.7 4 2.7%
 
7.9 1 0.7%
 

SepalWidthCm
Numeric

Distinct count 23
Unique (%) 15.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.054
Minimum 2
Maximum 4.4
Zeros (%) 0.0%

Quantile statistics

Minimum 2
5-th percentile 2.345
Q1 2.8
Median 3
Q3 3.3
95-th percentile 3.8
Maximum 4.4
Range 2.4
Interquartile range 0.5

Descriptive statistics

Standard deviation 0.43359
Coef of variation 0.14198
Kurtosis 0.29078
Mean 3.054
MAD 0.33309
Skewness 0.33405
Sum 458.1
Variance 0.188
Memory size 1.2 KiB
Value Count Frequency (%)  
3.0 26 17.3%
 
2.8 14 9.3%
 
3.2 13 8.7%
 
3.4 12 8.0%
 
3.1 12 8.0%
 
2.9 10 6.7%
 
2.7 9 6.0%
 
2.5 8 5.3%
 
3.5 6 4.0%
 
3.8 6 4.0%
 
Other values (13) 34 22.7%
 

Minimum 5 values

Value Count Frequency (%)  
2.0 1 0.7%
 
2.2 3 2.0%
 
2.3 4 2.7%
 
2.4 3 2.0%
 
2.5 8 5.3%
 

Maximum 5 values

Value Count Frequency (%)  
3.9 2 1.3%
 
4.0 1 0.7%
 
4.1 1 0.7%
 
4.2 1 0.7%
 
4.4 1 0.7%
 

Species
Categorical

Distinct count 3
Unique (%) 2.0%
Missing (%) 0.0%
Missing (n) 0
Iris-versicolor
50
Iris-virginica
50
Iris-setosa
50
Value Count Frequency (%)  
Iris-versicolor 50 33.3%
 
Iris-virginica 50 33.3%
 
Iris-setosa 50 33.3%
 

Correlations

Sample

Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa

Question 2 :

In [11]:
# Q2: load the Higgs boson training set and preview its first rows.
# The base data directory can be overridden through the DMG_DATA_DIR
# environment variable; when unset it falls back to the location originally
# hard-coded in this notebook, so existing setups keep working.
import os

DATA_DIR = os.environ.get('DMG_DATA_DIR', '/Users/rajmati.marlecha/Desktop/DMGAssignment')
boson_dataset = pd.read_csv(os.path.join(DATA_DIR, 'higgs-boson', 'training.csv'))
boson_dataset.head()
Out[11]:
EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot ... PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi PRI_jet_all_pt Weight Label
0 100000 138.470 51.655 97.827 27.980 0.91 124.711 2.666 3.064 41.928 ... 2 67.435 2.150 0.444 46.062 1.24 -2.475 113.497 0.002653 s
1 100001 160.937 68.768 103.235 48.146 -999.00 -999.000 -999.000 3.473 2.078 ... 1 46.226 0.725 1.158 -999.000 -999.00 -999.000 46.226 2.233584 b
2 100002 -999.000 162.172 125.953 35.635 -999.00 -999.000 -999.000 3.148 9.336 ... 1 44.251 2.053 -2.028 -999.000 -999.00 -999.000 44.251 2.347389 b
3 100003 143.905 81.417 80.943 0.414 -999.00 -999.000 -999.000 3.310 0.414 ... 0 -999.000 -999.000 -999.000 -999.000 -999.00 -999.000 -0.000 5.446378 b
4 100004 175.864 16.915 134.805 16.405 -999.00 -999.000 -999.000 3.891 16.405 ... 0 -999.000 -999.000 -999.000 -999.000 -999.00 -999.000 0.000 6.245333 b

5 rows × 33 columns

In [12]:
# Q2: load the Higgs boson test set and preview its first rows.
# The base data directory can be overridden through the DMG_DATA_DIR
# environment variable; when unset it falls back to the location originally
# hard-coded in this notebook, so existing setups keep working.
import os

DATA_DIR = os.environ.get('DMG_DATA_DIR', '/Users/rajmati.marlecha/Desktop/DMGAssignment')
boson_dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'higgs-boson', 'test.csv'))
boson_dataset_test.head()
Out[12]:
EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot ... PRI_met_phi PRI_met_sumet PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi PRI_jet_all_pt
0 350000 -999.000 79.589 23.916 3.036 -999.000 -999.000 -999.000 0.903 3.036 ... 2.022 98.556 0 -999.000 -999.000 -999.000 -999.000 -999.000 -999.000 -0.000
1 350001 106.398 67.490 87.949 49.994 -999.000 -999.000 -999.000 2.048 2.679 ... -1.138 176.251 1 47.575 -0.553 -0.849 -999.000 -999.000 -999.000 47.575
2 350002 117.794 56.226 96.358 4.137 -999.000 -999.000 -999.000 2.755 4.137 ... -1.868 111.505 0 -999.000 -999.000 -999.000 -999.000 -999.000 -999.000 0.000
3 350003 135.861 30.604 97.288 9.104 -999.000 -999.000 -999.000 2.811 9.104 ... 1.172 164.707 0 -999.000 -999.000 -999.000 -999.000 -999.000 -999.000 0.000
4 350004 74.159 82.772 58.731 89.646 1.347 536.663 -0.339 1.028 77.213 ... -0.231 869.614 3 254.085 -1.013 -0.334 185.857 0.335 2.587 599.213

5 rows × 31 columns

In [13]:
# Number of columns in the training frame.
len(boson_dataset.columns)
Out[13]:
33
In [15]:
# Distinct class labels present in the training set.
boson_dataset['Label'].unique()
Out[15]:
array(['s', 'b'], dtype=object)
In [20]:
# Column names other than the event identifier and the class label.
# Index.difference already returns the (sorted) remaining column names, so
# there is no need to build a subset of the whole DataFrame just to read its
# .columns back.
print(boson_dataset.columns.difference(['EventId', 'Label']))
Index(['DER_deltaeta_jet_jet', 'DER_deltar_tau_lep', 'DER_lep_eta_centrality',
       'DER_mass_MMC', 'DER_mass_jet_jet', 'DER_mass_transverse_met_lep',
       'DER_mass_vis', 'DER_met_phi_centrality', 'DER_prodeta_jet_jet',
       'DER_pt_h', 'DER_pt_ratio_lep_tau', 'DER_pt_tot', 'DER_sum_pt',
       'PRI_jet_all_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
       'PRI_jet_leading_pt', 'PRI_jet_num', 'PRI_jet_subleading_eta',
       'PRI_jet_subleading_phi', 'PRI_jet_subleading_pt', 'PRI_lep_eta',
       'PRI_lep_phi', 'PRI_lep_pt', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet',
       'PRI_tau_eta', 'PRI_tau_phi', 'PRI_tau_pt', 'Weight'],
      dtype='object')
In [176]:
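# Profiling report for the Higgs boson training data (call commented out).
# NOTE(review): the retained report below lists 37 variables, including *_log
# columns that are not among the 33 columns loaded above -- presumably it was
# generated from a later, feature-engineered state of boson_dataset; confirm.
#pandas_profiling.ProfileReport(boson_dataset)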
Out[176]:

Overview

Dataset info

Number of variables 37
Number of observations 250000
Total Missing (%) 0.0%
Total size in memory 70.6 MiB
Average record size in memory 296.0 B

Variables types

Numeric 23
Categorical 1
Boolean 0
Date 0
Text (Unique) 0
Rejected 13
Unsupported 0

Warnings

Variables

DER_deltaeta_jet_jet
Numeric

Distinct count 7087
Unique (%) 2.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -708.42
Minimum -999
Maximum 8.503
Zeros (%) 0.0%

Quantile statistics

Minimum -999
5-th percentile -999
Q1 -999
Median -999
Q3 0.49
95-th percentile 4.276
Maximum 8.503
Range 1007.5
Interquartile range 999.49

Descriptive statistics

Standard deviation 454.48
Coef of variation -0.64154
Kurtosis -1.1449
Mean -708.42
MAD 412.52
Skewness 0.92469
Sum -177110000
Variance 206550
Memory size 1.9 MiB
Value Count Frequency (%)  
-999.0 177457 71.0%
 
0.326 33 0.0%
 
0.43200000000000005 32 0.0%
 
0.5329999999999999 32 0.0%
 
0.574 32 0.0%
 
1.2930000000000001 32 0.0%
 
0.254 32 0.0%
 
0.792 32 0.0%
 
0.408 32 0.0%
 
0.087 31 0.0%
 
Other values (7077) 72255 28.9%
 

Minimum 5 values

Value Count Frequency (%)  
-999.0 177457 71.0%
 
0.0 6 0.0%
 
0.001 20 0.0%
 
0.002 28 0.0%
 
0.003 23 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
8.287 1 0.0%
 
8.301 1 0.0%
 
8.326 1 0.0%
 
8.459 1 0.0%
 
8.503 1 0.0%
 

DER_deltar_tau_lep
Numeric

Distinct count 4692
Unique (%) 1.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.3731
Minimum 0.208
Maximum 5.684
Zeros (%) 0.0%

Quantile statistics

Minimum 0.208
5-th percentile 0.973
Q1 1.81
Median 2.4915
Q3 2.961
95-th percentile 3.441
Maximum 5.684
Range 5.476
Interquartile range 1.151

Descriptive statistics

Standard deviation 0.78291
Coef of variation 0.32991
Kurtosis -0.22245
Mean 2.3731
MAD 0.64234
Skewness -0.21578
Sum 593270
Variance 0.61295
Memory size 1.9 MiB
Value Count Frequency (%)  
3.094 202 0.1%
 
3.1310000000000002 201 0.1%
 
3.0780000000000003 199 0.1%
 
3.117 197 0.1%
 
2.904 194 0.1%
 
3.0980000000000003 193 0.1%
 
3.133 193 0.1%
 
3.12 193 0.1%
 
3.1069999999999998 191 0.1%
 
3.1 190 0.1%
 
Other values (4682) 248047 99.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.20800000000000002 1 0.0%
 
0.22399999999999998 1 0.0%
 
0.228 1 0.0%
 
0.24600000000000002 1 0.0%
 
0.256 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
5.624 1 0.0%
 
5.626 1 0.0%
 
5.642 1 0.0%
 
5.655 1 0.0%
 
5.684 1 0.0%
 

DER_lep_eta_centrality
Highly correlated

This variable is highly correlated with DER_prodeta_jet_jet and should be ignored for analysis

Correlation 0.99999

DER_mass_MMC
Numeric

Distinct count 108338
Unique (%) 43.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -49.023
Minimum -999
Maximum 1192
Zeros (%) 0.0%

Quantile statistics

Minimum -999
5-th percentile -999
Q1 78.101
Median 105.01
Q3 130.61
95-th percentile 201.81
Maximum 1192
Range 2191
Interquartile range 52.505

Descriptive statistics

Standard deviation 406.35
Coef of variation -8.2889
Kurtosis 1.6242
Mean -49.023
MAD 289.66
Skewness -1.8547
Sum -12256000
Variance 165120
Memory size 1.9 MiB
Value Count Frequency (%)  
-999.0 38114 15.2%
 
121.26100000000001 10 0.0%
 
113.965 10 0.0%
 
125.46600000000001 10 0.0%
 
108.914 10 0.0%
 
132.292 10 0.0%
 
108.95700000000001 10 0.0%
 
103.762 10 0.0%
 
96.819 10 0.0%
 
111.12299999999999 10 0.0%
 
Other values (108328) 211796 84.7%
 

Minimum 5 values

Value Count Frequency (%)  
-999.0 38114 15.2%
 
9.044 1 0.0%
 
9.222000000000001 1 0.0%
 
9.652999999999999 1 0.0%
 
9.806000000000001 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
980.192 1 0.0%
 
985.102 1 0.0%
 
987.561 1 0.0%
 
988.199 1 0.0%
 
1192.026 1 0.0%
 

DER_mass_jet_jet
Highly correlated

This variable is highly correlated with DER_deltaeta_jet_jet and should be ignored for analysis

Correlation 0.94604

DER_mass_transverse_met_lep
Numeric

Distinct count 101637
Unique (%) 40.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 49.24
Minimum 0
Maximum 690.08
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 3.389
Q1 19.241
Median 46.524
Q3 73.598
95-th percentile 104.64
Maximum 690.08
Range 690.08
Interquartile range 54.357

Descriptive statistics

Standard deviation 35.345
Coef of variation 0.71781
Kurtosis 6.3668
Mean 49.24
MAD 28.607
Skewness 1.2192
Sum 12310000
Variance 1249.3
Memory size 1.9 MiB
Value Count Frequency (%)  
1.8 13 0.0%
 
9.362 12 0.0%
 
2.5 12 0.0%
 
3.1830000000000003 12 0.0%
 
3.5010000000000003 12 0.0%
 
0.113 11 0.0%
 
11.530999999999999 11 0.0%
 
8.967 11 0.0%
 
6.335 11 0.0%
 
11.089 11 0.0%
 
Other values (101627) 249884 100.0%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3 0.0%
 
0.001 1 0.0%
 
0.002 4 0.0%
 
0.003 4 0.0%
 
0.004 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
570.115 1 0.0%
 
571.868 1 0.0%
 
594.2869999999999 1 0.0%
 
595.819 1 0.0%
 
690.075 1 0.0%
 

DER_mass_vis
Numeric

Distinct count 100558
Unique (%) 40.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 81.182
Minimum 6.329
Maximum 1349.4
Zeros (%) 0.0%

Quantile statistics

Minimum 6.329
5-th percentile 37.874
Q1 59.389
Median 73.752
Q3 92.259
95-th percentile 149.27
Maximum 1349.4
Range 1343
Interquartile range 32.87

Descriptive statistics

Standard deviation 40.829
Coef of variation 0.50293
Kurtosis 35.494
Mean 81.182
MAD 25.454
Skewness 3.7903
Sum 20295000
Variance 1667
Memory size 1.9 MiB
Value Count Frequency (%)  
76.819 16 0.0%
 
61.286 15 0.0%
 
70.41199999999999 14 0.0%
 
63.648 13 0.0%
 
68.039 13 0.0%
 
71.097 13 0.0%
 
59.87 13 0.0%
 
68.752 13 0.0%
 
62.044 13 0.0%
 
79.36 13 0.0%
 
Other values (100548) 249864 99.9%
 

Minimum 5 values

Value Count Frequency (%)  
6.329 1 0.0%
 
6.462000000000001 1 0.0%
 
7.12 1 0.0%
 
7.202000000000001 1 0.0%
 
7.33 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1034.205 1 0.0%
 
1051.358 1 0.0%
 
1153.1660000000002 1 0.0%
 
1329.9129999999998 1 0.0%
 
1349.351 1 0.0%
 

DER_mass_vis_log
Highly correlated

This variable is highly correlated with DER_mass_vis and should be ignored for analysis

Correlation 0.90108

DER_met_phi_centrality
Numeric

Distinct count 2829
Unique (%) 1.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.1283
Minimum -1.414
Maximum 1.414
Zeros (%) 0.0%

Quantile statistics

Minimum -1.414
5-th percentile -1.413
Q1 -1.371
Median -0.356
Q3 1.225
95-th percentile 1.412
Maximum 1.414
Range 2.828
Interquartile range 2.596

Descriptive statistics

Standard deviation 1.1936
Coef of variation -9.3027
Kurtosis -1.7681
Mean -0.1283
MAD 1.1308
Skewness 0.15114
Sum -32076
Variance 1.4246
Memory size 1.9 MiB
Value Count Frequency (%)  
-1.4140000000000001 11429 4.6%
 
1.4140000000000001 7778 3.1%
 
-1.413 5227 2.1%
 
-1.412 3514 1.4%
 
1.413 3345 1.3%
 
-1.411 2820 1.1%
 
-1.41 2341 0.9%
 
1.412 2205 0.9%
 
-1.409 2178 0.9%
 
-1.4080000000000001 1929 0.8%
 
Other values (2819) 207234 82.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1.4140000000000001 11429 4.6%
 
-1.413 5227 2.1%
 
-1.412 3514 1.4%
 
-1.411 2820 1.1%
 
-1.41 2341 0.9%
 

Maximum 5 values

Value Count Frequency (%)  
1.41 1450 0.6%
 
1.411 1724 0.7%
 
1.412 2205 0.9%
 
1.413 3345 1.3%
 
1.4140000000000001 7778 3.1%
 

DER_prodeta_jet_jet
Highly correlated

This variable is highly correlated with DER_mass_jet_jet and should be ignored for analysis

Correlation 0.94444

DER_pt_h
Numeric

Distinct count 115563
Unique (%) 46.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 57.896
Minimum 0
Maximum 2835
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 1.201
Q1 14.069
Median 38.468
Q3 79.169
95-th percentile 183.49
Maximum 2835
Range 2835
Interquartile range 65.1

Descriptive statistics

Standard deviation 63.656
Coef of variation 1.0995
Kurtosis 22.028
Mean 57.896
MAD 45.406
Skewness 2.5419
Sum 14474000
Variance 4052
Memory size 1.9 MiB
Value Count Frequency (%)  
0.0 41 0.0%
 
1.308 25 0.0%
 
0.778 25 0.0%
 
1.8430000000000002 25 0.0%
 
0.6970000000000001 25 0.0%
 
0.763 24 0.0%
 
1.319 23 0.0%
 
1.094 23 0.0%
 
1.4340000000000002 23 0.0%
 
0.654 22 0.0%
 
Other values (115553) 249744 99.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 41 0.0%
 
0.005 1 0.0%
 
0.011000000000000001 1 0.0%
 
0.012 2 0.0%
 
0.013999999999999999 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
734.2769999999999 1 0.0%
 
753.745 1 0.0%
 
762.806 1 0.0%
 
1053.807 1 0.0%
 
2834.9990000000003 1 0.0%
 

DER_pt_ratio_lep_tau
Numeric

Distinct count 5931
Unique (%) 2.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.4376
Minimum 0.047
Maximum 19.773
Zeros (%) 0.0%

Quantile statistics

Minimum 0.047
5-th percentile 0.488
Q1 0.883
Median 1.28
Q3 1.777
95-th percentile 2.897
Maximum 19.773
Range 19.726
Interquartile range 0.894

Descriptive statistics

Standard deviation 0.84474
Coef of variation 0.5876
Kurtosis 18.297
Mean 1.4376
MAD 0.59245
Skewness 2.6335
Sum 359400
Variance 0.71359
Memory size 1.9 MiB
Value Count Frequency (%)  
0.9009999999999999 206 0.1%
 
1.128 198 0.1%
 
1.232 197 0.1%
 
0.9540000000000001 196 0.1%
 
1.249 195 0.1%
 
1.2830000000000001 193 0.1%
 
1.149 187 0.1%
 
1.2670000000000001 185 0.1%
 
1.198 184 0.1%
 
1.155 184 0.1%
 
Other values (5921) 248075 99.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.047 1 0.0%
 
0.07400000000000001 1 0.0%
 
0.077 1 0.0%
 
0.08 2 0.0%
 
0.081 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
16.776 1 0.0%
 
18.872 1 0.0%
 
18.992 1 0.0%
 
19.672 1 0.0%
 
19.773 1 0.0%
 

DER_pt_ratio_lep_tau_log
Highly correlated

This variable is highly correlated with DER_pt_ratio_lep_tau and should be ignored for analysis

Correlation 0.90512

DER_pt_tot
Numeric

Distinct count 59042
Unique (%) 23.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 18.917
Minimum 0
Maximum 2835
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 0.733
Q1 2.841
Median 12.316
Q3 27.591
95-th percentile 56.689
Maximum 2835
Range 2835
Interquartile range 24.75

Descriptive statistics

Standard deviation 22.273
Coef of variation 1.1774
Kurtosis 1036.5
Mean 18.917
MAD 15.649
Skewness 10.579
Sum 4729300
Variance 496.11
Memory size 1.9 MiB
Value Count Frequency (%)  
1.072 44 0.0%
 
0.9640000000000001 43 0.0%
 
1.2819999999999998 43 0.0%
 
0.851 41 0.0%
 
1.308 41 0.0%
 
1.5030000000000001 40 0.0%
 
1.693 40 0.0%
 
0.892 40 0.0%
 
1.26 40 0.0%
 
1.3619999999999999 39 0.0%
 
Other values (59032) 249589 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 39 0.0%
 
0.001 2 0.0%
 
0.003 1 0.0%
 
0.004 2 0.0%
 
0.005 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
372.721 1 0.0%
 
403.195 1 0.0%
 
466.525 1 0.0%
 
513.659 1 0.0%
 
2834.9990000000003 1 0.0%
 

DER_sum_pt
Numeric

Distinct count 156098
Unique (%) 62.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 158.43
Minimum 46.104
Maximum 1852.5
Zeros (%) 0.0%

Quantile statistics

Minimum 46.104
5-th percentile 55.973
Q1 77.55
Median 120.66
Q3 200.48
95-th percentile 383
Maximum 1852.5
Range 1806.4
Interquartile range 122.93

Descriptive statistics

Standard deviation 115.71
Coef of variation 0.73032
Kurtosis 8.8372
Mean 158.43
MAD 83.451
Skewness 2.3206
Sum 39608000
Variance 13388
Memory size 1.9 MiB
Value Count Frequency (%)  
76.854 13 0.0%
 
68.48100000000001 12 0.0%
 
64.523 11 0.0%
 
66.18 10 0.0%
 
69.8 10 0.0%
 
64.673 10 0.0%
 
77.59 9 0.0%
 
69.54 9 0.0%
 
63.293 9 0.0%
 
75.976 9 0.0%
 
Other values (156088) 249898 100.0%
 

Minimum 5 values

Value Count Frequency (%)  
46.104 1 0.0%
 
46.211999999999996 1 0.0%
 
46.227 1 0.0%
 
46.229 1 0.0%
 
46.25 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1558.993 1 0.0%
 
1675.4489999999998 1 0.0%
 
1687.0870000000002 1 0.0%
 
1703.7520000000002 1 0.0%
 
1852.4620000000002 1 0.0%
 

EventId
Numeric

Distinct count 250000
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 225000
Minimum 100000
Maximum 349999
Zeros (%) 0.0%

Quantile statistics

Minimum 100000
5-th percentile 112500
Q1 162500
Median 225000
Q3 287500
95-th percentile 337500
Maximum 349999
Range 249999
Interquartile range 125000

Descriptive statistics

Standard deviation 72169
Coef of variation 0.32075
Kurtosis -1.2
Mean 225000
MAD 62500
Skewness 0
Sum 56249875000
Variance 5208400000
Memory size 1.9 MiB
Value Count Frequency (%)  
100303 1 0.0%
 
142096 1 0.0%
 
187150 1 0.0%
 
181005 1 0.0%
 
183052 1 0.0%
 
193291 1 0.0%
 
195338 1 0.0%
 
189193 1 0.0%
 
191240 1 0.0%
 
168711 1 0.0%
 
Other values (249990) 249990 100.0%
 

Minimum 5 values

Value Count Frequency (%)  
100000 1 0.0%
 
100001 1 0.0%
 
100002 1 0.0%
 
100003 1 0.0%
 
100004 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
349995 1 0.0%
 
349996 1 0.0%
 
349997 1 0.0%
 
349998 1 0.0%
 
349999 1 0.0%
 

Label
Categorical

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
b
164333
s
85667
Value Count Frequency (%)  
b 164333 65.7%
 
s 85667 34.3%
 

PRI_jet_all_pt
Highly correlated

This variable is highly correlated with DER_sum_pt and should be ignored for analysis

Correlation 0.96563

PRI_jet_leading_eta
Highly correlated

This variable is highly correlated with PRI_jet_leading_pt and should be ignored for analysis

Correlation 0.9961

PRI_jet_leading_phi
Highly correlated

This variable is highly correlated with PRI_jet_leading_eta and should be ignored for analysis

Correlation 0.99999

PRI_jet_leading_pt
Numeric

Distinct count 86590
Unique (%) 34.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -348.33
Minimum -999
Maximum 1120.6
Zeros (%) 0.0%

Quantile statistics

Minimum -999
5-th percentile -999
Q1 -999
Median 38.96
Q3 75.349
95-th percentile 169.46
Maximum 1120.6
Range 2119.6
Interquartile range 1074.3

Descriptive statistics

Standard deviation 532.96
Coef of variation -1.5301
Kurtosis -1.8107
Mean -348.33
MAD 520.08
Skewness -0.38373
Sum -87082000
Variance 284050
Memory size 1.9 MiB
Value Count Frequency (%)  
-999.0 99913 40.0%
 
40.089 10 0.0%
 
36.493 10 0.0%
 
30.763 10 0.0%
 
36.358000000000004 10 0.0%
 
30.363000000000003 10 0.0%
 
31.739 10 0.0%
 
34.184 10 0.0%
 
35.656 9 0.0%
 
34.275 9 0.0%
 
Other values (86580) 149999 60.0%
 

Minimum 5 values

Value Count Frequency (%)  
-999.0 99913 40.0%
 
30.0 3 0.0%
 
30.000999999999998 2 0.0%
 
30.002 6 0.0%
 
30.003 5 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
738.235 1 0.0%
 
743.222 1 0.0%
 
755.235 1 0.0%
 
760.846 1 0.0%
 
1120.573 1 0.0%
 

PRI_jet_num
Numeric

Distinct count 4
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.97918
Minimum 0
Maximum 3
Zeros (%) 40.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 1
Q3 2
95-th percentile 3
Maximum 3
Range 3
Interquartile range 2

Descriptive statistics

Standard deviation 0.97743
Coef of variation 0.99821
Kurtosis -0.7378
Mean 0.97918
MAD 0.78266
Skewness 0.61128
Sum 244794
Variance 0.95536
Memory size 1.9 MiB
Value Count Frequency (%)  
0 99913 40.0%
 
1 77544 31.0%
 
2 50379 20.2%
 
3 22164 8.9%
 

Minimum 5 values

Value Count Frequency (%)  
0 99913 40.0%
 
1 77544 31.0%
 
2 50379 20.2%
 
3 22164 8.9%
 

Maximum 5 values

Value Count Frequency (%)  
0 99913 40.0%
 
1 77544 31.0%
 
2 50379 20.2%
 
3 22164 8.9%
 

PRI_jet_subleading_eta
Highly correlated

This variable is highly correlated with PRI_jet_subleading_pt and should be ignored for analysis

Correlation 0.99935

PRI_jet_subleading_phi
Highly correlated

This variable is highly correlated with PRI_jet_subleading_eta and should be ignored for analysis

Correlation 0.99999

PRI_jet_subleading_pt
Highly correlated

This variable is highly correlated with DER_lep_eta_centrality and should be ignored for analysis

Correlation 0.99935

PRI_lep_eta
Numeric

Distinct count 4987
Unique (%) 2.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.019507
Minimum -2.505
Maximum 2.503
Zeros (%) 0.0%

Quantile statistics

Minimum -2.505
5-th percentile -2.072
Q1 -1.014
Median -0.045
Q3 0.959
95-th percentile 2.066
Maximum 2.503
Range 5.008
Interquartile range 1.973

Descriptive statistics

Standard deviation 1.265
Coef of variation -64.846
Kurtosis -0.95698
Mean -0.019507
MAD 1.0698
Skewness 0.021623
Sum -4876.9
Variance 1.6002
Memory size 1.9 MiB
Value Count Frequency (%)  
0.307 106 0.0%
 
-0.20600000000000002 105 0.0%
 
0.392 95 0.0%
 
0.364 94 0.0%
 
-0.629 94 0.0%
 
0.524 92 0.0%
 
-0.158 91 0.0%
 
0.335 90 0.0%
 
-0.644 90 0.0%
 
0.34 90 0.0%
 
Other values (4977) 249053 99.6%
 

Minimum 5 values

Value Count Frequency (%)  
-2.505 1 0.0%
 
-2.494 1 0.0%
 
-2.49 3 0.0%
 
-2.489 2 0.0%
 
-2.487 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.496 1 0.0%
 
2.497 1 0.0%
 
2.499 1 0.0%
 
2.502 1 0.0%
 
2.503 1 0.0%
 

PRI_lep_phi
Numeric

Distinct count 6285
Unique (%) 2.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.043543
Minimum -3.142
Maximum 3.142
Zeros (%) 0.0%

Quantile statistics

Minimum -3.142
5-th percentile -2.834
Q1 -1.522
Median 0.086
Q3 1.618
95-th percentile 2.838
Maximum 3.142
Range 6.284
Interquartile range 3.14

Descriptive statistics

Standard deviation 1.8166
Coef of variation 41.72
Kurtosis -1.1901
Mean 0.043543
MAD 1.5694
Skewness -0.045746
Sum 10886
Variance 3.3001
Memory size 1.9 MiB
Value Count Frequency (%)  
3.05 64 0.0%
 
1.9169999999999998 64 0.0%
 
1.6059999999999999 64 0.0%
 
-0.171 63 0.0%
 
2.189 62 0.0%
 
0.207 62 0.0%
 
0.687 62 0.0%
 
0.086 62 0.0%
 
0.948 62 0.0%
 
1.7619999999999998 62 0.0%
 
Other values (6275) 249373 99.7%
 

Minimum 5 values

Value Count Frequency (%)  
-3.142 9 0.0%
 
-3.141 44 0.0%
 
-3.14 41 0.0%
 
-3.139 40 0.0%
 
-3.138 40 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.138 34 0.0%
 
3.139 42 0.0%
 
3.14 36 0.0%
 
3.141 40 0.0%
 
3.142 2 0.0%
 

PRI_lep_pt
Numeric

Distinct count 61929
Unique (%) 24.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 46.66
Minimum 26
Maximum 560.27
Zeros (%) 0.0%

Quantile statistics

Minimum 26
5-th percentile 27.213
Q1 32.375
Median 40.516
Q3 53.39
95-th percentile 86.647
Maximum 560.27
Range 534.27
Interquartile range 21.015

Descriptive statistics

Standard deviation 22.065
Coef of variation 0.47289
Kurtosis 21.583
Mean 46.66
MAD 14.882
Skewness 3.2408
Sum 11665000
Variance 486.86
Memory size 1.9 MiB
Value Count Frequency (%)  
30.037 22 0.0%
 
28.855 22 0.0%
 
28.58 22 0.0%
 
28.329 22 0.0%
 
34.902 21 0.0%
 
32.806 21 0.0%
 
30.416999999999998 20 0.0%
 
30.987 20 0.0%
 
32.924 20 0.0%
 
28.815 20 0.0%
 
Other values (61919) 249790 99.9%
 

Minimum 5 values

Value Count Frequency (%)  
26.0 2 0.0%
 
26.000999999999998 13 0.0%
 
26.002 15 0.0%
 
26.003 12 0.0%
 
26.004 6 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
437.707 1 0.0%
 
447.87800000000004 1 0.0%
 
452.434 1 0.0%
 
461.89599999999996 1 0.0%
 
560.271 1 0.0%
 

PRI_met
Numeric

Distinct count 87836
Unique (%) 35.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 41.717
Minimum 0.109
Maximum 2842.6
Zeros (%) 0.0%

Quantile statistics

Minimum 0.109
5-th percentile 8.4299
Q1 21.398
Median 34.802
Q3 51.895
95-th percentile 99.891
Maximum 2842.6
Range 2842.5
Interquartile range 30.497

Descriptive statistics

Standard deviation 32.895
Coef of variation 0.78852
Kurtosis 227.34
Mean 41.717
MAD 21.786
Skewness 5.2708
Sum 10429000
Variance 1082.1
Memory size 1.9 MiB
Value Count Frequency (%)  
31.252 15 0.0%
 
31.701 14 0.0%
 
30.498 13 0.0%
 
29.685 13 0.0%
 
33.951 13 0.0%
 
32.442 13 0.0%
 
25.933000000000003 13 0.0%
 
26.642 13 0.0%
 
27.022 13 0.0%
 
33.143 13 0.0%
 
Other values (87826) 249867 99.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.109 1 0.0%
 
0.155 1 0.0%
 
0.162 1 0.0%
 
0.179 1 0.0%
 
0.2 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
551.06 1 0.0%
 
593.237 1 0.0%
 
695.533 1 0.0%
 
951.363 1 0.0%
 
2842.617 1 0.0%
 

PRI_met_log
Numeric

Distinct count 87836
Unique (%) 35.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.4751
Minimum -2.2164
Maximum 7.9525
Zeros (%) 0.0%

Quantile statistics

Minimum -2.2164
5-th percentile 2.1318
Q1 3.0633
Median 3.5497
Q3 3.9492
95-th percentile 4.6041
Maximum 7.9525
Range 10.169
Interquartile range 0.88592

Descriptive statistics

Standard deviation 0.75452
Coef of variation 0.21712
Kurtosis 1.528
Mean 3.4751
MAD 0.57295
Skewness -0.6258
Sum 868780
Variance 0.5693
Memory size 1.9 MiB
Value Count Frequency (%)  
3.442083374134498 15 0.0%
 
3.4563482261270035 14 0.0%
 
3.8526975393433314 13 0.0%
 
3.4176611076928203 13 0.0%
 
2.72831024010957 13 0.0%
 
3.2966513490377674 13 0.0%
 
3.3906418677226386 13 0.0%
 
2.8927023732344472 13 0.0%
 
3.342437686455233 13 0.0%
 
3.2555162889631926 13 0.0%
 
Other values (87826) 249867 99.9%
 

Minimum 5 values

Value Count Frequency (%)  
-2.2164073967529934 1 0.0%
 
-1.8643301620628905 1 0.0%
 
-1.820158943749753 1 0.0%
 
-1.720369473141382 1 0.0%
 
-1.6094379124341003 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
6.311843696146471 1 0.0%
 
6.38559398188572 1 0.0%
 
6.544678458117106 1 0.0%
 
6.857895693185185 1 0.0%
 
7.952480385727471 1 0.0%
 

PRI_met_phi
Numeric

Distinct count 6285
Unique (%) 2.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.010119
Minimum -3.142
Maximum 3.142
Zeros (%) 0.0%

Quantile statistics

Minimum -3.142
5-th percentile -2.831
Q1 -1.575
Median -0.024
Q3 1.561
95-th percentile 2.823
Maximum 3.142
Range 6.284
Interquartile range 3.136

Descriptive statistics

Standard deviation 1.8122
Coef of variation -179.09
Kurtosis -1.1961
Mean -0.010119
MAD 1.5686
Skewness 0.0079078
Sum -2529.8
Variance 3.2842
Memory size 1.9 MiB
Value Count Frequency (%)  
-2.0909999999999997 66 0.0%
 
-2.063 65 0.0%
 
-2.911 64 0.0%
 
1.6 64 0.0%
 
0.516 63 0.0%
 
2.4090000000000003 61 0.0%
 
0.9309999999999999 61 0.0%
 
0.263 61 0.0%
 
-3.113 61 0.0%
 
-0.642 60 0.0%
 
Other values (6275) 249374 99.7%
 

Minimum 5 values

Value Count Frequency (%)  
-3.142 3 0.0%
 
-3.141 35 0.0%
 
-3.14 30 0.0%
 
-3.139 43 0.0%
 
-3.138 43 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.138 47 0.0%
 
3.139 26 0.0%
 
3.14 37 0.0%
 
3.141 41 0.0%
 
3.142 6 0.0%
 

PRI_met_sumet
Highly correlated

This variable is highly correlated with DER_sum_pt and should be ignored for analysis

Correlation 0.90448

PRI_met_sumet_log
Highly correlated

This variable is highly correlated with PRI_met_sumet and should be ignored for analysis

Correlation 0.92121

PRI_tau_eta
Numeric

Distinct count 4971
Unique (%) 2.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.010973
Minimum -2.499
Maximum 2.497
Zeros (%) 0.0%

Quantile statistics

Minimum -2.499
5-th percentile -2.011
Q1 -0.925
Median -0.023
Q3 0.898
95-th percentile 2.011
Maximum 2.497
Range 4.996
Interquartile range 1.823

Descriptive statistics

Standard deviation 1.2141
Coef of variation -110.64
Kurtosis -0.84229
Mean -0.010973
MAD 1.0123
Skewness 0.017852
Sum -2743.3
Variance 1.474
Memory size 1.9 MiB
Value Count Frequency (%)  
0.152 141 0.1%
 
-0.152 131 0.1%
 
-0.301 129 0.1%
 
-0.899 124 0.0%
 
0.3 122 0.0%
 
0.301 122 0.0%
 
-0.898 121 0.0%
 
0.9009999999999999 121 0.0%
 
-0.9009999999999999 121 0.0%
 
0.899 119 0.0%
 
Other values (4961) 248749 99.5%
 

Minimum 5 values

Value Count Frequency (%)  
-2.499 1 0.0%
 
-2.498 3 0.0%
 
-2.497 1 0.0%
 
-2.496 2 0.0%
 
-2.495 3 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.492 4 0.0%
 
2.4930000000000003 1 0.0%
 
2.494 2 0.0%
 
2.495 1 0.0%
 
2.497 2 0.0%
 

PRI_tau_phi
Numeric

Distinct count 6285
Unique (%) 2.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.0081711
Minimum -3.142
Maximum 3.142
Zeros (%) 0.0%

Quantile statistics

Minimum -3.142
5-th percentile -2.829
Q1 -1.575
Median -0.033
Q3 1.565
95-th percentile 2.83
Maximum 3.142
Range 6.284
Interquartile range 3.14

Descriptive statistics

Standard deviation 1.8168
Coef of variation -222.34
Kurtosis -1.2006
Mean -0.0081711
MAD 1.5736
Skewness 0.013872
Sum -2042.8
Variance 3.3006
Memory size 1.9 MiB
Value Count Frequency (%)  
-2.988 72 0.0%
 
-1.235 69 0.0%
 
-2.017 66 0.0%
 
-0.542 66 0.0%
 
-0.54 65 0.0%
 
-0.536 64 0.0%
 
-1.0290000000000001 64 0.0%
 
1.215 63 0.0%
 
-1.247 62 0.0%
 
2.594 62 0.0%
 
Other values (6275) 249347 99.7%
 

Minimum 5 values

Value Count Frequency (%)  
-3.142 4 0.0%
 
-3.141 33 0.0%
 
-3.14 36 0.0%
 
-3.139 43 0.0%
 
-3.138 32 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.138 35 0.0%
 
3.139 21 0.0%
 
3.14 29 0.0%
 
3.141 35 0.0%
 
3.142 5 0.0%
 

PRI_tau_pt
Numeric

Distinct count 59639
Unique (%) 23.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 38.707
Minimum 20
Maximum 764.41
Zeros (%) 0.0%

Quantile statistics

Minimum 20
5-th percentile 20.787
Q1 24.592
Median 31.804
Q3 45.017
95-th percentile 77.882
Maximum 764.41
Range 744.41
Interquartile range 20.425

Descriptive statistics

Standard deviation 22.412
Coef of variation 0.57901
Kurtosis 30.512
Mean 38.707
MAD 14.799
Skewness 3.7552
Sum 9676900
Variance 502.3
Memory size 1.9 MiB
Value Count Frequency (%)  
21.134 32 0.0%
 
20.059 30 0.0%
 
21.219 29 0.0%
 
20.048 29 0.0%
 
22.256999999999998 29 0.0%
 
20.195 29 0.0%
 
20.875999999999998 28 0.0%
 
21.575 28 0.0%
 
20.660999999999998 28 0.0%
 
23.138 27 0.0%
 
Other values (59629) 249711 99.9%
 

Minimum 5 values

Value Count Frequency (%)  
20.0 17 0.0%
 
20.000999999999998 15 0.0%
 
20.002 15 0.0%
 
20.003 17 0.0%
 
20.004 19 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
415.985 1 0.0%
 
449.648 1 0.0%
 
505.06 1 0.0%
 
622.862 1 0.0%
 
764.408 1 0.0%
 

Weight
Numeric

Distinct count 104096
Unique (%) 41.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.6468
Minimum 0.0015019
Maximum 7.8225
Zeros (%) 0.0%

Quantile statistics

Minimum 0.0015019
5-th percentile 0.0015027
Q1 0.018636
Median 1.1562
Q3 2.4041
95-th percentile 5.348
Maximum 7.8225
Range 7.821
Interquartile range 2.3855

Descriptive statistics

Standard deviation 1.8751
Coef of variation 1.1387
Kurtosis -0.1545
Mean 1.6468
MAD 1.5264
Skewness 0.99063
Sum 411690
Variance 3.516
Memory size 1.9 MiB
Value Count Frequency (%)  
0.0015027048310099999 38552 15.4%
 
0.018636116672000002 32352 12.9%
 
1.68161144262 17259 6.9%
 
0.7440562472300001 10351 4.1%
 
0.0026533113373299996 7789 3.1%
 
0.00150187015894 6974 2.8%
 
1.4548484726800002 6818 2.7%
 
0.07389912981499999 5051 2.0%
 
0.0713571365583 4657 1.9%
 
0.309795155685 4013 1.6%
 
Other values (104086) 116184 46.5%
 

Minimum 5 values

Value Count Frequency (%)  
0.00150187015894 6974 2.8%
 
0.0015027048310099999 38552 15.4%
 
0.0026533113373299996 7789 3.1%
 
0.018636116672000002 32352 12.9%
 
0.0640607773058 2998 1.2%
 

Maximum 5 values

Value Count Frequency (%)  
7.769833369520001 1 0.0%
 
7.805034958110001 1 0.0%
 
7.817382808239999 1 0.0%
 
7.821960707020001 1 0.0%
 
7.82254254503 1 0.0%
 

Correlations

Sample

EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot DER_sum_pt DER_pt_ratio_lep_tau DER_met_phi_centrality DER_lep_eta_centrality PRI_tau_pt PRI_tau_eta PRI_tau_phi PRI_lep_pt PRI_lep_eta PRI_lep_phi PRI_met PRI_met_phi PRI_met_sumet PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi PRI_jet_all_pt Weight Label PRI_met_log DER_mass_vis_log DER_pt_ratio_lep_tau_log PRI_met_sumet_log
0 100000 138.470000 51.655000 97.827000 27.980000 0.910000 124.711000 2.666000 3.064000 41.928000 197.760000 1.582000 1.396000 0.200000 32.638000 1.017000 0.381000 51.626000 2.273000 -2.414000 16.824000 -0.277000 258.733000 2 67.435000 2.150000 0.444000 46.062000 1.240000 -2.475000 113.497000 0.002653 s 2.822806 4.583201 0.458690 5.555797
1 100001 160.937000 68.768000 103.235000 48.146000 -999.000000 -999.000000 -999.000000 3.473000 2.078000 125.157000 0.879000 1.414000 -999.000000 42.014000 2.039000 -3.011000 36.918000 0.501000 0.103000 44.704000 -1.916000 164.546000 1 46.226000 0.725000 1.158000 -999.000000 -999.000000 -999.000000 46.226000 2.233584 b 3.800063 4.637008 -0.128970 5.103190
2 100002 -999.000000 162.172000 125.953000 35.635000 -999.000000 -999.000000 -999.000000 3.148000 9.336000 197.814000 3.776000 1.414000 -999.000000 32.154000 -0.705000 -2.093000 121.409000 -0.953000 1.052000 54.283000 -2.186000 260.414000 1 44.251000 2.053000 -2.028000 -999.000000 -999.000000 -999.000000 44.251000 2.347389 b 3.994211 4.835909 1.328665 5.562273
3 100003 143.905000 81.417000 80.943000 0.414000 -999.000000 -999.000000 -999.000000 3.310000 0.414000 75.968000 2.354000 -1.285000 -999.000000 22.647000 -1.655000 0.010000 53.321000 -0.522000 -3.100000 31.082000 0.060000 86.062000 0 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 -0.000000 5.446378 b 3.436629 4.393745 0.856116 4.455068
4 100004 175.864000 16.915000 134.805000 16.405000 -999.000000 -999.000000 -999.000000 3.891000 16.405000 57.983000 1.056000 -1.385000 -999.000000 28.209000 -2.197000 -2.231000 29.774000 0.798000 1.569000 2.723000 -0.871000 53.131000 0 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 0.000000 6.245333 b 1.001734 4.903829 0.054488 3.972761
In [23]:
# Histograms of every training column, ignoring the class label.
# EventId, Label and Weight are left out of the plotted columns.
feature_columns = boson_dataset.columns.difference(['EventId', 'Label', 'Weight'])
hist = boson_dataset[feature_columns].hist(figsize=(30, 30))

# the entire dataset including the test
#result_df = pd.DataFrame(boson_dataset.append(boson_dataset_test))
#hist = result_df[result_df.columns.difference(['EventId', 'Label',''])].hist(figsize=(30,30))
In [24]:
# Split the training events into the two classes found in the Label column.
boson_dataset_s = boson_dataset.loc[boson_dataset.Label == 's']
boson_dataset_b = boson_dataset.loc[boson_dataset.Label == 'b']

# NOTE: DataFrame.size is rows x columns, so the figures printed below are
# cell counts rather than event counts.
print(f'Total: {boson_dataset.size}')
print(f'Class S: {boson_dataset_s.size}')
print(f'Class B: {boson_dataset_b.size}')
Total: 8250000
Class S: 2827011
Class B: 5422989
In [26]:
# Feature histograms for the 's' events only (EventId, Label and Weight excluded).
hist_s = boson_dataset_s.loc[:, boson_dataset_s.columns.difference(['EventId', 'Label', 'Weight'])].hist(figsize=(30, 30))
In [25]:
# Feature histograms for the 'b' events only (EventId, Label and Weight excluded).
hist_b = boson_dataset_b.loc[:, boson_dataset_b.columns.difference(['EventId', 'Label', 'Weight'])].hist(figsize=(30, 30))
In [195]:
# Overlay the 's' and 'b' distributions of every feature, one figure per feature.
# The two class subsets are built once up front instead of being re-filtered on
# every iteration, and each curve carries a label so the legend tells the
# classes apart.
signal_events = boson_dataset[boson_dataset.Label == 's']
background_events = boson_dataset[boson_dataset.Label == 'b']

for i, col in enumerate(boson_dataset.columns.difference(['EventId', 'Label', 'Weight'])):
    plt.figure(i)
    sns.distplot(signal_events[col], label='s')
    sns.distplot(background_events[col], label='b')
    plt.legend(title='Label')

Summary Q2 :

  • Looking at the above histograms, none of the variables shows a clear distinction between the values of the two classes
  • Each variable has a considerable overlap between the classes
  • Even the log-transformed values won't help in distinguishing them
  • The ranges for DER_mass_transverse_met_lep are somewhat distinguishable; however, there is a big overlap there too

Question 3

In [38]:
# Preview the first rows of the iris data used by the scatter plots in this section.
iris_dataset.head()
Out[38]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.100000 3.500000 1.400000 0.200000 Iris-setosa
1 2 4.900000 3.000000 1.400000 0.200000 Iris-setosa
2 3 4.700000 3.200000 1.300000 0.200000 Iris-setosa
3 4 4.600000 3.100000 1.500000 0.200000 Iris-setosa
4 5 5.000000 3.600000 1.400000 0.200000 Iris-setosa
In [40]:
# One species-coloured scatter plot per feature pair, drawn in the listed order.
iris_feature_pairs = [
    ("SepalLengthCm", "SepalWidthCm"),
    ("SepalLengthCm", "PetalWidthCm"),
    ("SepalWidthCm", "PetalLengthCm"),
    ("SepalWidthCm", "PetalWidthCm"),
    ("SepalLengthCm", "PetalLengthCm"),
    ("PetalLengthCm", "PetalWidthCm"),
]
for x_col, y_col in iris_feature_pairs:
    g = sns.FacetGrid(iris_dataset, hue="Species", height=6)
    g = g.map(plt.scatter, x_col, y_col).add_legend()

Summary Q3 : Looking at the above scatter plots, it is evident that the classes are best separated in the following combination, which shows the maximum separation among the species:

  • PetalWidthCm and PetalLengthCm
  • Two other plots (counting from the bottom up) also separate the classes; however, there is overlap between the points there

Question 4 :

In [35]:
##This block considers all variables and can be used iteratively to minimize re-running code
#We can limit the variables of interest if we want to
def Plot(varname,dataset):
    """Plot the distribution of one column before and after a natural-log transform.

    Two side-by-side seaborn distribution plots are drawn, each with a fitted
    normal curve (``fit=norm``): the raw column on the left and its natural
    logarithm on the right. The figure is shown with ``plt.show()``.

    Parameters
    ----------
    varname : str
        Name of the column in ``dataset`` to plot; also used in both titles.
    dataset : pandas.DataFrame
        Frame that holds the column.

    Notes
    -----
    The logarithm is only defined for strictly positive values, so rows whose
    value is missing, zero or negative are left out of the right-hand panel.
    The earlier ``np.log(...).fillna(0)`` replaced the NaN produced by negative
    or missing inputs with a fake log-value of 0 and let ``-inf`` (from zeros)
    reach the normal fit.
    """
    f, (ax1,ax2) = plt.subplots(1, 2, figsize=(20,4))

    ax1.set_title('Before Log for var:' + varname)
    ax2.set_title('After Log for var:' + varname)

    raw_values = dataset[varname]

    # Left panel is unchanged: missing values are plotted as 0.
    sns.distplot(raw_values.fillna(0),ax=ax1,fit=norm)

    # Comparisons against NaN are False, so this mask also drops missing values.
    positive_values = raw_values[raw_values > 0]
    if not positive_values.empty:
        sns.distplot(np.log(positive_values),ax=ax2,fit=norm)
    # With no positive value there is nothing to take the log of; the right
    # panel is then left empty instead of raising.

    plt.show()
   


# Columns that are inspected before/after the log transform and then logged.
log_candidate_columns = ['PRI_met', 'DER_mass_vis', 'DER_pt_ratio_lep_tau', 'PRI_met_sumet']

# First show the before/after distribution of every candidate ...
for column_name in log_candidate_columns:
    Plot(column_name, boson_dataset)

# ... then store the natural log of each one in a new '<column>_log' column.
for column_name in log_candidate_columns:
    boson_dataset[column_name + '_log'] = np.log(boson_dataset[column_name])
In [53]:
# Draw 2000 random events from each class for the scatter plots.
boson_dataset_sample = boson_dataset.groupby('Label').apply(lambda class_rows: class_rows.sample(2000))

# Each raw pair is followed by its log-transformed counterpart;
# the last two entries are the additional pair.
boson_scatter_pairs = [
    ("PRI_met_sumet", "DER_pt_ratio_lep_tau"),
    ("PRI_met_sumet_log", "DER_pt_ratio_lep_tau_log"),
    ("PRI_met", "DER_mass_vis"),
    ("PRI_met_log", "DER_mass_vis_log"),
]
for x_col, y_col in boson_scatter_pairs:
    g = sns.FacetGrid(boson_dataset_sample, hue="Label", height=6)
    g = g.map(plt.scatter, x_col, y_col).add_legend()

Summary Q4 :

  • Picked those columns whose distribution, after a log transformation, looks approximately normal, as shown above in the individual histograms of each variable without and with the log.
  • Chose combinations of two of them to draw scatter plots before and after the log.
  • Looking at the above scatter plots before and after the log for the variables "PRI_met_sumet" and "DER_pt_ratio_lep_tau", we see that
    • In the plain scatter plots we can't distinguish the classes much; moreover, there doesn't seem to be a relation between the two variables, and the points overlap very closely
    • Transforming the data with the log has spread the data out and made it visually interpretable
    • In the logged scatter plots of the same variables we can see a negative correlation emerging between PRI_met_sumet and DER_pt_ratio_lep_tau

Question 5 : 2D PCA for each digit class

In [70]:
# Read the digit-recognizer training CSV and show its dimensions.
digit_train_csv = '/Users/rajmati.marlecha/Desktop/DMGAssignment/digit-recognizer/train.csv'
digit_recognizer = pd.read_csv(digit_train_csv)
digit_recognizer.shape
Out[70]:
(42000, 785)
In [71]:
# Sorted array of the distinct digit labels present in the training data.
digitArray = np.sort(digit_recognizer.label.unique())
digitArray
Out[71]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [72]:
def twoDPCA(clas):
    """Project the rows of one digit class onto their first two principal components.

    Parameters
    ----------
    clas : int
        Digit label; only rows of ``digit_recognizer`` whose ``label`` equals
        this value are used.

    Returns
    -------
    numpy.ndarray
        One row per selected image and two columns (the PCA scores).
    """
    # Kept as a local import, as in the original cell.
    from sklearn.preprocessing import StandardScaler

    # Every column after the first is used as a feature. The previously
    # computed (and never used) target array has been removed.
    X = digit_recognizer[digit_recognizer.label==clas].iloc[:, 1:].values

    # Standardize first; scaler and PCA are both fitted on this class only.
    X_std = StandardScaler().fit_transform(X)

    pca = PCA(n_components = 2)
    return pca.fit_transform(X_std)
In [73]:
def plot2DPCA(digits):
    """Scatter-plot the 2D PCA projection of every digit in ``digits``, side by side.

    Parameters
    ----------
    digits : sequence of int
        Digit labels to plot; each one is passed to ``twoDPCA``.

    Notes
    -----
    The earlier version always drew three panels and only filled the second
    and third when more than two digits were given, so a chunk of exactly two
    digits silently lost its second digit. Now every digit gets a panel and
    panels without a digit are hidden. An empty ``digits`` draws nothing.
    """
    n_digits = len(digits)
    if n_digits == 0:
        return

    # Keep at least three columns so panels have the same size as in the
    # fixed 1x3 layout used before.
    n_cols = max(n_digits, 3)
    f, axes = plt.subplots(1, n_cols, figsize=(20,4), squeeze=False)
    axes = axes[0]

    for ax, digit in zip(axes, digits):
        projected = twoDPCA(digit)
        ax.set_title('Scatter for 2D PCA for Digit :' + str(digit))
        ax.scatter(projected[:,0], projected[:,1])

    # Hide panels that received no digit (e.g. a final chunk with one digit).
    for ax in axes[n_digits:]:
        ax.set_visible(False)

    plt.show()
In [74]:
# Walk the sorted digit labels three at a time; each chunk becomes one figure.
for x in range(0, digitArray.size, 3):
    y = x + 3
    plot2DPCA(digitArray[x:y])

Question 5 Summary :

* Looking at the above scatter plots of the 2D PCA for each digit, we can see that
* 1, 4, 7 and 9 are the digits that have more spread compared to the other digits
* We can observe this by looking at the range of the values of the PCA components
* We can deduce that there is a higher spread among the digits that can be written in multiple ways
* People write these digits in different ways

Question 6

In [75]:
# Reduce columns 1..784 of the digit data to 30 principal components
# and keep the scores in a DataFrame.
pca = PCA(n_components=30)
X = digit_recognizer.iloc[:, 1:785].values
DIGITS_PCA_30_dataset_array = pca.fit_transform(X)
DIGITS_PCA_30_dataset = pd.DataFrame(data=DIGITS_PCA_30_dataset_array)
DIGITS_PCA_30_dataset.head()
Out[75]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
0 -661.595780 -699.311329 183.282042 120.611398 -81.081824 489.461847 -683.470895 85.559497 348.548292 202.980240 -364.553391 21.264995 404.477151 -97.048964 61.841956 -86.806832 17.566938 285.478027 18.489390 207.299992 44.105701 222.279285 56.833039 -150.908486 13.615222 -39.822421 -329.048320 209.887501 53.774265 85.084384
1 1701.451685 -360.551556 -501.805593 335.423654 -442.378931 738.404042 653.875432 -176.600386 -7.520126 67.845959 34.221877 46.551135 -70.435414 -342.688616 377.844994 -5.674843 317.738249 87.597697 -94.560900 -175.011465 -213.072028 -272.313867 6.882494 -22.581181 -34.636878 264.347635 -75.546643 14.564312 -83.783591 -89.918589
2 -886.894434 -293.765783 67.155311 78.263766 -473.715929 -323.540652 437.799060 -305.377773 -195.295404 -25.625397 367.599003 252.106279 54.574054 -58.590929 163.088229 -162.499168 -100.321591 -155.356230 70.093155 -182.973527 -104.998611 128.702540 152.083811 51.397146 -113.140441 89.196840 -228.414856 107.748696 -10.485223 123.854704
3 -165.755602 300.182762 -64.145486 759.706252 -425.844359 157.390367 -304.099073 276.409517 -45.867386 -295.481490 -2.753945 -256.887521 -87.784179 -175.353144 40.048777 -87.516544 54.449475 199.835489 -11.539386 298.295407 -232.006594 -90.413399 293.350033 128.015966 -67.282002 -195.306691 -189.473525 82.536451 221.288171 196.478324
4 1923.709716 -449.153070 -548.613023 188.555150 -651.736270 990.063828 564.507103 -255.915507 124.914695 177.567891 -19.491802 333.725465 -213.053366 -354.650122 93.474844 26.795321 232.924581 -54.718908 -45.238239 -256.050012 -15.475114 -131.242623 -140.946481 -56.700300 175.366804 -8.765334 50.303635 -164.771254 -67.505413 17.474002
In [125]:
# Attach the digit label to the PCA scores so rows can be selected per class.
DIGITS_PCA_30_dataset['label'] = digit_recognizer.label
print(DIGITS_PCA_30_dataset.shape)
DIGITS_PCA_30_dataset.head()
(42000, 31)
Out[125]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 label
0 -661.595780 -699.311329 183.282042 120.611398 -81.081824 489.461847 -683.470895 85.559497 348.548292 202.980240 -364.553391 21.264995 404.477151 -97.048964 61.841956 -86.806832 17.566938 285.478027 18.489390 207.299992 44.105701 222.279285 56.833039 -150.908486 13.615222 -39.822421 -329.048320 209.887501 53.774265 85.084384 1
1 1701.451685 -360.551556 -501.805593 335.423654 -442.378931 738.404042 653.875432 -176.600386 -7.520126 67.845959 34.221877 46.551135 -70.435414 -342.688616 377.844994 -5.674843 317.738249 87.597697 -94.560900 -175.011465 -213.072028 -272.313867 6.882494 -22.581181 -34.636878 264.347635 -75.546643 14.564312 -83.783591 -89.918589 0
2 -886.894434 -293.765783 67.155311 78.263766 -473.715929 -323.540652 437.799060 -305.377773 -195.295404 -25.625397 367.599003 252.106279 54.574054 -58.590929 163.088229 -162.499168 -100.321591 -155.356230 70.093155 -182.973527 -104.998611 128.702540 152.083811 51.397146 -113.140441 89.196840 -228.414856 107.748696 -10.485223 123.854704 1
3 -165.755602 300.182762 -64.145486 759.706252 -425.844359 157.390367 -304.099073 276.409517 -45.867386 -295.481490 -2.753945 -256.887521 -87.784179 -175.353144 40.048777 -87.516544 54.449475 199.835489 -11.539386 298.295407 -232.006594 -90.413399 293.350033 128.015966 -67.282002 -195.306691 -189.473525 82.536451 221.288171 196.478324 4
4 1923.709716 -449.153070 -548.613023 188.555150 -651.736270 990.063828 564.507103 -255.915507 124.914695 177.567891 -19.491802 333.725465 -213.053366 -354.650122 93.474844 26.795321 232.924581 -54.718908 -45.238239 -256.050012 -15.475114 -131.242623 -140.946481 -56.700300 175.366804 -8.765334 50.303635 -164.771254 -67.505413 17.474002 0
In [131]:
def _sample_centers_for_class(digit_label):
    """Return 10 randomly sampled rows of one digit class (first 30 columns only), reindexed from 0."""
    class_rows = DIGITS_PCA_30_dataset[DIGITS_PCA_30_dataset.label == digit_label]
    return class_rows.iloc[:, 0:30].sample(n=10).reset_index(drop=True)


# Ten candidate initial centers per class, sampled in class order 0..9.
cluster_centers_class0 = _sample_centers_for_class(0)
cluster_centers_class1 = _sample_centers_for_class(1)
cluster_centers_class2 = _sample_centers_for_class(2)
cluster_centers_class3 = _sample_centers_for_class(3)
cluster_centers_class4 = _sample_centers_for_class(4)
cluster_centers_class5 = _sample_centers_for_class(5)
cluster_centers_class6 = _sample_centers_for_class(6)
cluster_centers_class7 = _sample_centers_for_class(7)
cluster_centers_class8 = _sample_centers_for_class(8)
cluster_centers_class9 = _sample_centers_for_class(9)
cluster_centers_class0.head(10)
Out[131]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
0 947.714147 -241.117585 -276.651909 -164.271022 358.450078 -623.386981 120.255554 213.249644 -151.484938 -217.194960 -153.690008 -746.294288 0.392494 -643.950065 324.863905 105.177959 54.993093 63.001149 -29.029979 -421.024782 -46.797382 -73.372633 157.372604 34.601213 -46.919940 -29.069742 58.820646 110.170843 -181.334687 96.400874
1 892.724401 -227.117228 366.709841 -296.076736 -864.594265 -447.487888 -189.315261 -216.877476 -273.944311 274.110988 -111.994871 47.693260 -291.542325 407.879375 -16.560997 -267.320933 -191.918727 94.493796 248.629034 -13.922736 249.338930 38.731225 -142.432136 177.787839 248.477887 148.567440 211.392941 322.176976 146.838993 64.231047
2 953.747885 -182.647088 405.948511 44.189688 128.215920 -850.428660 562.533969 339.716086 352.122094 -158.456284 -497.796919 -391.096326 1.887902 -466.450363 299.357853 -272.948311 -1.053956 73.067006 -114.459454 -257.422169 -62.119460 85.286100 228.414495 255.287583 102.761921 119.702934 147.251590 -33.353525 9.892319 191.835267
3 679.167017 321.280739 237.504405 284.034557 -606.111225 -169.311497 543.699388 -65.457137 340.330863 269.964491 -412.085545 161.873325 409.234806 94.890698 -337.128670 -166.504060 98.153946 -123.849749 -461.199893 -100.286838 -215.061878 -181.424115 -465.788081 169.967503 32.481823 -250.945527 41.076895 33.263503 222.716373 105.693883
4 970.423802 -0.167939 489.131272 -374.440208 -920.072649 -257.749796 155.345690 -178.413534 164.673589 209.768962 -258.847656 68.188904 -259.185927 274.261148 -435.935692 -128.907053 -152.824814 305.584327 82.645530 -387.106206 25.530307 -206.204084 -130.271614 100.227668 180.664789 -151.983899 -54.080037 142.534916 -12.863664 206.806269
5 1025.779580 -247.771119 661.749767 -83.006888 -1078.052575 -320.655344 14.408930 -147.246333 219.631077 290.898648 -171.020279 215.892937 -97.050007 345.343208 -315.034521 -4.806259 43.360058 -157.919949 80.989053 -21.338615 -17.252624 -373.444898 48.900231 -128.700081 -13.318912 -74.441954 271.323310 236.901966 36.175968 175.485405
6 1084.948320 -136.642710 212.519483 -105.504374 -409.116153 -478.700337 -155.251812 -190.552997 -503.858509 -58.071311 -290.160640 -262.465003 45.206327 -54.162732 585.736764 -160.480027 -34.575323 127.571411 -118.288044 -6.410533 16.038249 55.347073 63.432193 41.293891 61.928386 460.904178 82.669067 233.391962 364.780093 -128.916335
7 1539.068327 146.331724 856.573501 -692.254340 -200.373436 -18.671059 176.275997 624.602079 828.229926 -26.686015 -252.282070 401.026654 107.363000 117.981422 -14.773659 -146.256496 -390.672392 -171.117009 -108.953262 -123.409165 88.071994 -220.680189 -7.726764 -30.723416 -125.017273 49.241258 170.635791 151.174137 -218.916265 -66.585026
8 1312.054961 -426.413626 20.958925 -346.814091 -282.754826 -153.682302 169.170536 -454.741615 46.976695 -223.889405 -351.634266 -415.658386 -349.327530 -728.627174 35.727738 -54.631543 151.928100 -234.468355 -65.089419 -56.776669 71.218199 183.062468 352.340275 112.151990 208.945094 99.667583 -190.187510 -204.119492 157.731306 -172.583085
9 853.252388 -358.873760 571.565692 -206.495907 -901.560432 -161.733612 -234.559537 -101.872965 -296.924871 316.403210 -114.290460 131.169254 -155.627317 715.033142 -153.293891 -201.550763 71.807271 -100.811442 212.420161 174.422955 62.706495 -98.290904 22.548092 112.480919 328.919519 -42.581412 83.429604 435.443834 138.654737 187.514433
In [132]:
# One randomly chosen row per digit class, with the label column removed,
# so each of the ten initial centers comes from a different class.
cluster_centers_allclasses = (
    DIGITS_PCA_30_dataset
    .groupby('label')
    .apply(lambda class_rows: class_rows.sample(1))
    .reset_index(drop=True)
    .drop(['label'], axis=1)
)
cluster_centers_allclasses.head()
Out[132]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
0 1027.752065 -244.678145 465.468845 -527.612439 -814.814163 -320.928767 -373.776641 5.892921 -72.928350 164.506104 0.846011 -109.744066 -219.784001 182.798499 291.380213 -242.206684 -257.948816 -106.107909 227.706126 -28.090817 217.595364 -42.730436 159.831140 145.012091 63.234263 184.377339 209.268775 101.526801 305.226537 -170.874249
1 -884.076670 -439.985587 57.927437 -72.046714 -341.017626 -400.798958 410.463443 -259.024020 26.948038 162.451639 475.257808 243.017425 -48.897723 -134.756211 307.622866 -129.860934 -94.760297 -216.370235 -87.326774 -43.307335 -75.861337 65.388997 151.577595 19.010485 -114.129509 -115.347839 -28.683085 -48.040134 -2.359464 256.514934
2 727.932539 -76.990688 984.075786 -155.116294 -79.332911 355.758493 345.811429 88.288817 -264.100535 505.926207 439.332662 -351.916326 -182.982404 -244.997599 -124.681894 -64.757306 -393.506053 -7.960393 -295.602495 81.768283 133.207832 179.198096 272.904681 -364.562463 -45.727567 42.628349 -21.909418 134.047476 -343.540370 -314.292465
3 -270.731462 -137.761953 -527.312647 -218.644504 668.026240 244.705156 35.418742 72.747732 -227.567966 590.507901 57.532947 108.541570 -212.535653 325.987549 463.969679 22.380771 -147.791566 314.094441 248.194372 153.268533 -329.141577 -22.089079 -257.199419 -122.974593 123.708121 65.945157 45.062804 -30.353133 -21.039216 -75.827219
4 182.037623 946.997780 342.315656 827.243798 -230.147281 383.199303 -91.828932 103.421079 -6.930702 -149.524495 704.118567 264.463570 92.404438 251.224135 349.890751 -19.404172 8.658991 98.809489 26.388725 124.174156 -111.929443 115.415509 -225.093791 -102.336807 242.157418 12.097886 55.293899 -109.351479 -237.601500 48.974202
In [163]:
def checkPurity(y_rand, y_pred_rand):
    """Return the purity of a clustering, rounded to three decimals.

    A contingency table is built with ``metrics.confusion_matrix``, passing the
    predicted cluster ids first and the reference labels second. Purity is the
    sum of each row's largest cell divided by the sum of all cells.

    Parameters
    ----------
    y_rand : array-like
        Reference (true) labels.
    y_pred_rand : array-like
        Cluster ids assigned to the same samples.
    """
    contingency = metrics.confusion_matrix(y_pred_rand, y_rand)
    largest_cell_per_row = contingency.max(axis=1)
    return round(largest_cell_per_row.sum() / contingency.sum(), 3)
In [135]:
def runKmeanswithInitData(df_init):
    """Run KMeans on the 30 PCA columns, starting from the given initial centers.

    Parameters
    ----------
    df_init : pandas.DataFrame
        Initial cluster centers, one row per cluster and one column per PCA
        component. The number of rows sets the number of clusters (10 for every
        initialization built in this notebook).

    Returns
    -------
    tuple
        ``(y, y_pred, n_iter)``: the label column of ``DIGITS_PCA_30_dataset``,
        the cluster id predicted for every row, and the number of iterations
        the fit ran.
    """
    # n_init=1: with explicit starting centers there is only one initialization
    # to try, so say so instead of relying on the estimator's default.
    ktest = KMeans(n_clusters=len(df_init),init=df_init,n_init=1,max_iter=200,tol=0.0001)

    X,y = DIGITS_PCA_30_dataset.iloc[:, 0:30],DIGITS_PCA_30_dataset.iloc[:, 30]
    # Fit on the features, then assign every row to its nearest center.
    ktest.fit(X)
    y_pred = ktest.predict(X)
    return y,y_pred,ktest.n_iter_


def RunAll():
    """Run KMeans once per initialization and tabulate iterations and purity.

    The initializations are the ten per-class center sets
    (``cluster_centers_class0`` .. ``cluster_centers_class9``) followed by
    ``cluster_centers_allclasses`` (reported as ``'All'``).

    Returns
    -------
    pandas.DataFrame
        One row per initialization with the columns
        ``initialization_method_class``, ``iterations`` and ``purity_inits``.
    """
    print("Clusters with Inits for all different Initializations")

    # (reported name, initial centers) in the order they are run. The globals
    # are looked up when RunAll is called, as the individual stanzas did.
    initializations = [
        (0, cluster_centers_class0),
        (1, cluster_centers_class1),
        (2, cluster_centers_class2),
        (3, cluster_centers_class3),
        (4, cluster_centers_class4),
        (5, cluster_centers_class5),
        (6, cluster_centers_class6),
        (7, cluster_centers_class7),
        (8, cluster_centers_class8),
        (9, cluster_centers_class9),
        ('All', cluster_centers_allclasses),
    ]

    clusts_inits = []
    iters_inits = []
    purity_inits = []
    for init_name, init_centers in initializations:
        y,y_pred,n_iter_ = runKmeanswithInitData(init_centers)
        clusts_inits.append(init_name)
        iters_inits.append(n_iter_)
        purity_inits.append(checkPurity(y,y_pred))

    data = pd.DataFrame()
    data['initialization_method_class'] = clusts_inits
    data['iterations'] = iters_inits
    data['purity_inits'] = purity_inits
    return data
# Run every initialization once and display the resulting summary table.
df = RunAll()  
df
Clusters with Inits for all different Initializations
Out[135]:
initialization_method_class iterations purity_inits
0 0 44 0.589000
1 1 105 0.595000
2 2 173 0.595000
3 3 32 0.598000
4 4 105 0.579000
5 5 37 0.527000
6 6 59 0.595000
7 7 122 0.632000
8 8 70 0.589000
9 9 44 0.594000
10 All 73 0.632000

Question 6A Summary :

  • Looking at the above table, the best purity score (0.632) is obtained when the ten initial cluster centers are each taken from a different class (the "All" row); the class-7 initialization reaches the same purity of 0.632
  • The "All" run took 73 iterations to converge under the given tolerance stopping criterion, compared with 122 iterations for the class-7 run
  • We also get a decent purity of 0.598 when the centers are initialized from class 3, with convergence in just 32 iterations

Question 6B :

In [161]:
def runKmeansRandomInit(n):
    """Cluster the 30 PCA columns into ``n`` clusters with KMeans (``random_state=0``).

    Parameters
    ----------
    n : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(labels, predicted, n_iter)``: the label column of
        ``DIGITS_PCA_30_dataset``, the cluster id predicted for every row, and
        the number of iterations the fit ran.
    """
    features = DIGITS_PCA_30_dataset.iloc[:, 0:30]
    labels = DIGITS_PCA_30_dataset.iloc[:, 30]

    model = KMeans(n_clusters=n,random_state=0)
    model.fit(features)
    predicted = model.predict(features)
    return labels,predicted,model.n_iter_
In [164]:
# Purity of KMeans for 5, 10, 15, 20 and 25 clusters.
clusts = []
purity = []
for noofClusters in range(5, 30, 5):
    print("Cluster", noofClusters, sep="\n")
    y_rand, y_pred_rand, iters = runKmeansRandomInit(noofClusters)
    clusts.append(noofClusters)
    purity.append(checkPurity(y_rand, y_pred_rand))
Cluster
5
Cluster
10
Cluster
15
Cluster
20
Cluster
25
In [166]:
# Purity plotted against the number of clusters.
data = pd.DataFrame({'clusters': clusts, 'purity': purity}, columns=['clusters', 'purity'])
ax = sns.scatterplot(x="clusters", y="purity", data=data)

Question 6B Interpretation

  • Looking at the above plot for different numbers of clusters, we can observe that the purity increases as we increase the number of clusters

Question 7 :

In [167]:
# We will use the data without the labels as input for the GMM: columns 1-4
# are the four measurement columns, which leaves out Id (column 0) and the
# Species label (column 5). Show the first rows as a sanity check.
iris_dataset.iloc[:, 1:5].head()
Out[167]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
0 5.100000 3.500000 1.400000 0.200000
1 4.900000 3.000000 1.400000 0.200000
2 4.700000 3.200000 1.300000 0.200000
3 4.600000 3.100000 1.500000 0.200000
4 5.000000 3.600000 1.400000 0.200000
In [174]:
#Created a function to fit the Gaussian mixture model; added comments in the function

from sklearn.mixture import GaussianMixture

def runGMM(mixture):
    """Fit a full-covariance Gaussian mixture to iris columns 1-4.

    Parameters
    ----------
    mixture : int
        Number of mixture components, passed to ``GaussianMixture`` as
        ``n_components``.

    Returns
    -------
    float
        The fitted model's ``lower_bound_``, i.e. scikit-learn's lower bound
        on the log-likelihood of the training data. Callers exponentiate it
        to obtain the value they plot.

    NOTE(review): the original author treated this as interchangeable with
    ``gmm.score(...)`` evaluated on the same training data -- confirm if exact
    equality matters.
    """
    features = iris_dataset.iloc[:, 1:5]
    # Fixed random_state keeps the EM initialisation reproducible.
    model = GaussianMixture(n_components=mixture, covariance_type='full', random_state=0)
    model.fit(features)
    return model.lower_bound_


# Sweep odd component counts 1, 3, ..., 31, fit a GMM for each one and plot
# the resulting likelihood value against the number of components.
mixtures = []
likelihood = []

for mix in range(1, 33, 2):
    mixtures.append(mix)
    # runGMM returns a log-scale value; exponentiate it and round to two
    # decimals to get the value that is plotted.
    likeliVal = round(np.exp(runGMM(mix)), 2)
    likelihood.append(likeliVal)

data = pd.DataFrame({'mixtures': mixtures, 'likelihood': likelihood})
ax = sns.scatterplot(x="mixtures", y="likelihood", data=data)
# One tick per fitted component count (previously np.arange(1,35,2) added a
# stray tick at 33, which is never fitted). The trailing ';' stops the
# notebook from dumping the list of XTick objects as the cell output.
ax.set(xticks=mixtures);
Out[174]:
[[<matplotlib.axis.XTick at 0x1a2ae9e550>,
  <matplotlib.axis.XTick at 0x1a1c8b7a58>,
  <matplotlib.axis.XTick at 0x1a1d591668>,
  <matplotlib.axis.XTick at 0x1a29f93dd8>,
  <matplotlib.axis.XTick at 0x1a29f93fd0>,
  <matplotlib.axis.XTick at 0x1a29f93b38>,
  <matplotlib.axis.XTick at 0x1a2ae85710>,
  <matplotlib.axis.XTick at 0x1a2ae85f28>,
  <matplotlib.axis.XTick at 0x1a2ae85470>,
  <matplotlib.axis.XTick at 0x1a1f5afb70>,
  <matplotlib.axis.XTick at 0x1a1d886da0>,
  <matplotlib.axis.XTick at 0x1a2ae85cc0>,
  <matplotlib.axis.XTick at 0x1a1d88d208>,
  <matplotlib.axis.XTick at 0x1a218d9438>,
  <matplotlib.axis.XTick at 0x1a218d9080>,
  <matplotlib.axis.XTick at 0x1a218d9cf8>,
  <matplotlib.axis.XTick at 0x1a1cd10080>]]

Question 7 Summary:

  • Looking at the above plot, we can see the likelihood value plotted against the number of Gaussian mixture components used.
  • The likelihood value increases as we increase the number of mixture components.
  • This is expected: more components give the model more flexibility to fit the training data, which raises the likelihood.